# Loading required R packages
library(ggplot2)
library(plotly)
library(shiny)
library(gridExtra)
library(xlsx)
library(MASS)
library(sf)
library(akima)

Sys.setenv('MAPBOX_TOKEN' = 'pk.eyJ1IjoibGFrc2hpZGFhIiwiYSI6ImNqbWIyOHN2NTRlZ3kzam10aTljeGNybWgifQ.8EG9Y6r024e-TGk79f7GpA')

1 Reading Data

2 Data Mugging

2.0.1 Quantile Computation

get_outliers <- function(x){
  quantile_values = quantile(x, probs = c(0.25, 0.75))
  q1 = quantile_values["25%"]
  q3 = quantile_values["75%"]
  
  return(c(which((x > (q3+1.5*(q3-q1)))), which(x < (q1-1.5*(q3-q1)))))
}

2.0.2 Scaling the Data

baseball_scaled <- scale(baseball_data[,3:length(baseball_data)])

2.0.3 Distance Matrix between rows

distance_matrix <- dist(baseball_scaled)

2.0.4 Non-metric MDS

mds_result <- isoMDS(distance_matrix, k=2, p=2)
## initial  value 19.856833 
## iter   5 value 16.319153
## iter  10 value 16.046215
## final  value 15.935476 
## converged
coords <- mds_result$points
coords_mds <- as.data.frame(coords)
baseball_data_with_mds <- baseball_data
baseball_data_with_mds$MDS_V1 <- coords_mds$V1
baseball_data_with_mds$MDS_V2 <- coords_mds$V2

3 Single Plots

3.1 Density Plot

3.1.1 Density Plot with Outlier Highlight using GGplot2

density_plot_infection_risk = ggplot(senic_data) + 
  ggtitle("Density plot of Infection_Risk")  + 
  geom_density(aes(x=Infection_Risk), fill = "lightblue") + 
  geom_point(data=senic_data[get_outliers(senic_data$Infection_Risk),],
             aes(x=Infection_Risk, y=0, colour="Outliers"), 
             shape=23, size=2, fill="red") +
  scale_color_manual(values = c("darkblue","black")) + 
  labs(colour="Legend") +
  theme(plot.title = element_text(hjust = 0.5), legend.position = "right")

density_plot_infection_risk

3.1.2 Density Plot with Outlier Highlight using Plotly (converting from ggplot2)

x <- ggplotly(p=density_plot_infection_risk)

x

3.2 Histogram Plot

3.2.1 Histogram plot with Outlier Highlight using Plotly

outliers = senic_data[get_outliers(senic_data$Infection_Risk),c("Infection_Risk")]
senic_data$zero = 0

p <- plot_ly(senic_data, x=~Infection_Risk) %>% 
  add_histogram(name="Histogram count") %>% 
  filter(is.element(Infection_Risk, outliers)) %>% 
  add_markers(x=~Infection_Risk,y=~zero, name="Outliers", 
              marker=list(symbol="diamond", size=10, line = list(color="black", width=1))) %>%
  layout(title="Histogram of Infection_Risk", yaxis=list(title="Count"))

p

3.3 Scatter Plot

3.3.1 Simple scatter plot with colour

ggplot(senic_data) + geom_point(aes(x=Number_of_Nurses, y=Infection_Risk, color=Number_of_Beds)) + 
  ggtitle("Scatterplot of Infection_Risk vs Number_of_Nurses") + 
  theme(plot.title = element_text(hjust = 0.5))

3.3.2 Scatter Plot with Discreetization (split a variable into classes)

ggplot(olive_data) + 
  geom_point(aes(x = oleic, y = palmitic, 
                 color=cut_interval(olive_data$linolenic, n = 4))) +
  ggtitle("Dependence of Palmitic vs Oleic vs Linolenic") +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(color = 'Linolenic range') 

3.3.3 Scatter plot size varied

ggplot(olive_data) + geom_point(aes(x = oleic, y = palmitic, size = cut_interval(linolenic, n = 4))) + 
  ggtitle("Dependence of Palmitic vs Oleic vs Linolenic") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_size_manual(name = "Linolenic range", values = c(1, 2, 3, 4))

3.3.4 Scatter plot angle varied

# Pre-processing - Setting angle values based on category
olive_data$linolenic_class <- cut_interval(olive_data$linolenic, n = 4)
levels(olive_data$linolenic_class) <- (0:3) * (pi/4)
olive_data$linolenic_class <- as.numeric(as.character(olive_data$linolenic_class))

ggplot(olive_data, aes(x=oleic, y=palmitic)) + geom_point() +  
  geom_spoke(aes(angle = olive_data$linolenic_class), radius=40) + 
  ggtitle("Dependence of Palmitic vs Oleic vs Linolenic 
Legend
Orientation angle of spoke : Linolenic class
0:(0,18.5], 45:(18.5,37], 90:(37,55.5], 135:(0,18.5] ") +
  theme(plot.title = element_text(hjust = 0.5)) 

3.3.5 Scatter plot with color, size and shape

ggplot(olive_data)+
  geom_point(aes(x = oleic, y = eicosenoic, color = cut_interval(linoleic, n = 3),
                            shape = cut_interval(palmitic, n = 3),
                            size = cut_interval(palmitoleic, n = 3))) + 
  scale_size_manual(values = c(2,3,4)) +
  labs(shape = "Palmitic range", color = "Linoleic range", size = "Palmitoleic range") +
  ggtitle("Dependence of Oleic vs Eicosenoic vs Linoleic vs Palmitic vs Palmitoleic") +
  theme(plot.title = element_text(hjust = 0.5))

3.3.6 Scatter Plot of MDS

baseball_scaled <- scale(baseball_data[,3:length(baseball_data)])

### Distance Matrix between rows
distance_matrix <- dist(baseball_scaled)

### Non-metric MDS
mds_result <- isoMDS(distance_matrix, k=2, p=2)
## initial  value 19.856833 
## iter   5 value 16.319153
## iter  10 value 16.046215
## final  value 15.935476 
## converged
coords <- mds_result$points
coords_mds <- as.data.frame(coords)
baseball_data_with_mds <- baseball_data
baseball_data_with_mds$MDS_V1 <- coords_mds$V1
baseball_data_with_mds$MDS_V2 <- coords_mds$V2



plot_ly(baseball_data_with_mds, x=~MDS_V1, y=~MDS_V2) %>% 
  add_markers(color=~League, colors = c("blue", "red"), 
              text=baseball_data_with_mds$Team, hoverinfo="text") %>%
  layout(title="Scatterplot of the 2 MDS variables")

3.4 Shepard Plot

3.4.1 Shepard plot shows the fit of MDS, the distance in original dataset vs. distance in reordered dataset should be similar

baseball_scaled <- scale(baseball_data[,3:length(baseball_data)])

### Distance Matrix between rows
distance_matrix <- dist(baseball_scaled)
mds_result <- isoMDS(distance_matrix, k=2, p=2)
## initial  value 19.856833 
## iter   5 value 16.319153
## iter  10 value 16.046215
## final  value 15.935476 
## converged
coords <- mds_result$points

shp <- Shepard(distance_matrix, coords)
delta <- as.numeric(distance_matrix)
D <- as.numeric(dist(coords))

n <- nrow(coords)
index <- matrix(1:n, nrow=n, ncol=n)
index1 <- as.numeric(index[lower.tri(index)])

n <- nrow(coords)
index <- matrix(1:n, nrow=n, ncol=n, byrow = T)
index2 <- as.numeric(index[lower.tri(index)])


plot_ly()%>%
  add_markers(x=~delta, y=~D, name="Observation pairs", hoverinfo = 'text', 
              text = ~paste('Obj 1: ', 
                            rownames(baseball_data_with_mds)[index1], 
                            '<br> Obj 2: ', 
                            rownames(baseball_data_with_mds)[index2])) %>%
  add_lines(x=~shp$x, y=~shp$yf, name="Isotonic regression line") %>%
  layout(title="Shepard's plot of MDS operation")

3.5 Pie Charts

3.5.1 Simple pie chart

plot_ly(olive_data,labels=~Area,type='pie',textinfo = "none") %>%
  layout(title = "Pie chart of proportion of oils coming from different areas")

3.6 2D density contour plot

3.6.1 Simple 2D plot using ggplot2

ggplot(olive_data)+geom_density_2d(aes(x=eicosenoic, y=linoleic)) +
  ggtitle("Contour plot of Linoleic vs Eicosenoic") +
  theme(plot.title = element_text(hjust = 0.5))

4 Multiple Plots

4.1 Density Plots

4.1.1 Density Plot with Outlier Highlight

plot_density_with_outliers <- function(var_data, col_name){
  p <- NULL
  df_data = setNames(data.frame(var_data),col_name)
  if(length(get_outliers(df_data[[col_name]])) > 0){
    p <- ggplot(df_data) + 
      geom_density(aes_string(x=col_name), fill = "lightblue", color = "darkblue") + 
      geom_point(data=df_data[get_outliers(df_data[[col_name]]),,drop=FALSE],
                 aes_string(x=col_name), y=0, shape=23, size=2, colour="black", fill="red")
  }
  else{
    p <- ggplot(df_data) + 
      ggtitle(paste("")) + 
      theme(plot.title = element_text(hjust = 0.5)) +
      geom_density(aes_string(x=col_name), fill = "lightblue", color = "darkblue")
  }
  
  return(p)
}

categorical_columns = c("Medical_School_Affiliation", "Region")
ID_columns = c("Identification_Number")
quantitative_columns = setdiff(colnames(senic_data), c(categorical_columns, ID_columns))

plot_list = mapply(plot_density_with_outliers, senic_data[, quantitative_columns], 
                   colnames(senic_data[, quantitative_columns]), SIMPLIFY = FALSE)
plot_matrix <- arrangeGrob(grobs = plot_list, ncol = 2)
grid.arrange(plot_matrix, respect=TRUE, top="Density plots of SENIC data variables")

5 Shiny

#UI component
ui <- fluidPage(
  sliderInput(inputId="bw_value", label="Choose bandwidth size", value=4.5, min=0.1, max=80),
  checkboxGroupInput("selected_variables", "Variables to show: ", quantitative_columns, inline=TRUE),
  plotOutput("densPlot", height = "650px")
)

plot_density_with_outliers_shiny <- function(df_data, col_name, bw){
  p <- NULL
  if(length(get_outliers(senic_data[[col_name]])) > 0){
    p <- ggplot(df_data) + 
      ggtitle(paste("Density plot of ", col_name)) + 
      theme(plot.title = element_text(hjust = 0.5)) + 
      geom_density(aes_string(x=col_name), fill = "lightblue", color = "darkblue", bw=bw) +
      geom_point(data=df_data[get_outliers(df_data[[col_name]]),],
                 aes_string(x=col_name, y=0), shape=23, size=2, colour="black", fill="red")
  }
  else{
    p <- ggplot(df_data) + 
      ggtitle(paste("Density plot of ", col_name)) + 
      theme(plot.title = element_text(hjust = 0.5)) + 
      geom_density(aes_string(x=col_name), fill = "lightblue", color = "darkblue", bw=bw)
  }
  
  return(p)
}


server <- function(input, output) {
  
  output$densPlot <- renderPlot({
    
    selected_columns = input$selected_variables
    plot_list = vector("list", length(selected_columns))
    
    if(length(selected_columns) > 0){
      for(i in 1:length(selected_columns)){
        plot_list[[i]] = plot_density_with_outliers_shiny(senic_data, selected_columns[i], 
                                                    bw = input$bw_value)
      }
      plot_matrix <- arrangeGrob(grobs = plot_list, ncol = 2)
      grid.arrange(plot_matrix)
    }
    
  })
}

shinyApp(ui = ui, server = server, options = list(width="800px", height="900px"))